Chapter 9 Support Vector Machines

This notebook is a rough outline of solutions to the applied exercises in Chapter 9 of An Introduction to Statistical Learning (James et al., 2013).

Setup

library(ISLR)
library(e1071)
library(cowplot)
library(latex2exp)
library(tidyverse)
library(patchwork)
theme_set(theme_bw())

# Given a filename, return the filepath. Default folder is the `data` folder in the current working directory.
# Given a filename, return the filepath. Default folder is the `data` folder
# in the current working directory.
# Uses base file.path() instead of stringr::str_c so this helper has no
# tidyverse dependency; output is identical ("/"-joined path).
get_path <- function(filename) {
  file.path(getwd(), "data", filename)
}

# Load my functions:
source("/home/brian/R/dir/Util/bh_functions.R")

# Colors:
ISLR_blue <- "#176FC0"
ISLR_brown <- "#94340E"
ISLR_lightblue <- "#68A5DB"
ISLR_orange <- "#CC5318"

# Functions for plotting `svm` objects:
# Signed distance from point p = (x1, x2) to the line
# coef[1] + coef[2]*x1 + coef[3]*x2 = 0.
# The sign (which side of the line) is kept on purpose: svc_margins() takes
# which.max()/which.min() over these signed values to find the extreme
# support vectors on either side.
dist2point <- function(coef, p) {
  numer <- coef[1] + coef[2] * p[1] + coef[3] * p[2]
  denom <- sqrt(sum(coef[2:3]^2))
  unname(numer / denom)
}

# Slope and intercepts of a linear-kernel e1071 svm's decision boundary and
# its two margins, in the swapped (x = x2, y = x1) plane used by svm_plot().
# Returns c(slope, hyperplane intercept, margin-1 intercept, margin-2 intercept).
# FIX: the signed distances to all support vectors were computed twice
# (once for which.max, once for which.min); compute the vector once.
# NOTE(review): int0 is -coef[1] with no division by coef[2] — looks like a
# missing "/ coef(svc)[2]"; it is only consumed by a commented-out
# geom_abline() in svm_plot(), so left as-is. Confirm before reusing int0.
# NOTE(review): svc$SV holds support vectors on e1071's scaled data when the
# model was fit with scale = TRUE — margins may be offset in that case.
svc_margins <- function(svc) {
  b <- coef(svc) # (intercept, beta_x1, beta_x2)
  slope <- -b[3]/b[2] # slope
  int0 <- -b[1] # hyperplane intercept

  # Signed distance of every support vector from the hyperplane.
  dists <- apply(svc$SV, 1, function(v) dist2point(b, v))
  v1 <- svc$SV[which.max(dists), ] # support vector farthest on one side
  v2 <- svc$SV[which.min(dists), ] # support vector farthest on the other side

  int1 <- v1[1] - slope*v1[2] # intercept of margin 1
  int2 <- v2[1] - slope*v2[2] # intercept of margin 2

  unname(c(slope, int0, int1, int2))
}

# Plot an e1071 `svm` model's predicted classes over a 100x100 grid spanning
# the training data, with the data points overlaid (X = support vector,
# O = non-support point). For linear kernels the two margin lines are added.
# Assumes `df` has the two features in columns 1-2 (named x1 and x2 — the
# aes() calls below refer to those names) and the class label (levels
# "-1"/"1") in the last column. Note the axes are swapped: x2 on the x-axis,
# x1 on the y-axis.
svm_plot <- function(svm, df) {
  dat <- df
  x1 <- dat[ , 1]
  x2 <- dat[ , 2]
  y <- dat[ , ncol(dat)]
  
  # Prediction grid covering the observed range of both features.
  grid <- expand.grid(seq(min(x1), max(x1), length.out = 100),
                      seq(min(x2), max(x2), length.out = 100)) 
  names(grid) <- names(dat)[1:2]
  preds <- predict(svm, grid)
  dat_ <- data.frame(grid, preds)
  
  cols <- c('1' = '#f53224', '-1' = '#1a6eff') # 1 := red, -1 := blue; three shades darker on https://www.w3schools.com/colors/colors_picker.asp
  tiles <- c('1' = '#F8766D', '-1' = '#619CFF')
  shapes <- c('support' = 4, 'notsupport' = 1)
  # Row indices of the support vectors are stored in svm$index.
  dat$support <- 'notsupport'
  dat[svm$index, 'support'] <- 'support'

  # Background tiles colored by predicted class.
  g <- ggplot(dat_, aes(x = x2, y = x1)) +
          geom_tile(aes(fill = preds), color = "white", alpha = 0.3) + 
          scale_fill_manual(name = "Pred. Class", values = tiles)
  
  # NOTE(review): svc_margins() reads svm$SV, which is on scaled data when
  # the model was fit with scale = TRUE — margin lines may be offset; confirm.
  if(svm$kernel == 0) { # if linear kernel add margins
    margins <- svc_margins(svm)
    g <- g + 
          #geom_abline(aes(slope = margins[1], intercept = margins[2]), col = "green") + # line of separation
          geom_abline(aes(slope = margins[1], intercept = margins[4]), col = cols[1]) + #  1 := red
          geom_abline(aes(slope = margins[1], intercept = margins[3]), col = cols[2]) # -1 := blue
  }

  # Data points on top; support vectors drawn as X.
  g <- g +
        geom_point(data = dat, aes(color = y, shape = support), size = 2, show.legend = FALSE) +
        scale_color_manual(values = cols) +
        scale_shape_manual(values = shapes) +
        scale_x_continuous(expand = c(0, 0)) +
        scale_y_continuous(expand = c(0, 0)) +
        guides(fill = guide_legend(override.aes = list(alpha = 1))) +
        theme_bw() +
        ggtitle('SVM Classification Plot') +
        theme(plot.title = element_text(hjust = 0.5))
  g
}

# Return vector of the names of numeric columns in a data frame:
# Return vector of the names of numeric columns in a data frame.
# Uses base vapply() instead of purrr::map_lgl so the helper works without
# the tidyverse attached; explicit return() dropped per R idiom. Behavior
# unchanged (character(0) when no column is numeric).
numeric_cols <- function(df) {
  names(df)[vapply(df, is.numeric, logical(1))]
}

Exercise 4

Generate a simulated two-class data set with 100 observations and two features in which there is a visible but non-linear separation between the two classes. Show that in this setting, a support vector machine with a polynomial kernel (with degree greater than 1) or a radial kernel will outperform a support vector classifier on the training data. Which technique performs best on the test data? Make plots and report training and test error rates in order to back up your assertions.

Generate data:

# Prelims: fix the RNG so the simulated data are reproducible.
set.seed(1)

# Data: quadratic backbone x2 = 4*x1^2 + 1 + noise gives a visibly
# non-linear separation between the two classes.
x1 <- rnorm(100)
x2 <- 4 * x1^2 + 1 + rnorm(100)

# Shift a random half of the points up (+3) and the rest down (-3).
class <- sample(100, 50)
x2[class] <- x2[class] + 3
x2[-class] <- x2[-class] - 3

y <- rep(-1, 100)
y[class] <- 1

dat <- data.frame(x1 = x1,
                  x2 = x2,
                  y = as.factor(y))

# Errors: accumulators for per-model training/test error rates.
error_train <- c()
error_test <- c()

# Train/Test: 80/20 split; negative indices select the test rows.
train <- sample(100, 80)
test <- -train # was `test = -train`; use `<-` for assignment

# Plots:
ggplot(dat) +
  geom_point(aes(x1, x2, group = y, color = y)) +
  theme_bw()

Support Vector Classifier

Train a support vector classifier (linear kernel) on a range of values for cost:

# 10-fold CV (tune() default) over a grid of cost values, linear kernel.
set.seed(1)
tune.out_4_svc <- tune(svm, y ~ ., data = dat[train, ], kernel = "linear", ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out_4_svc)

Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:

- best performance: 0.1125 

- Detailed performance results:
NA

Extract the best model and print summary:

bestmod_4_svc <- tune.out_4_svc$best.model
summary(bestmod_4_svc)

Call:
best.tune(method = svm, train.x = y ~ ., data = dat[train, ], ranges = list(cost = c(0.001, 0.01, 0.1, 1, 
    5, 10, 100)), kernel = "linear")


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  linear 
       cost:  0.1 

Number of Support Vectors:  56

 ( 28 28 )


Number of Classes:  2 

Levels: 
 -1 1

Plot the decision boundary:

#plot(bestmod_4_svc, dat[train, ])
svm_plot(bestmod_4_svc, dat[train, ])

The linear classifier fails to accurately capture the non-linearity of the decision boundary.

Training error

mean(dat[train, "y"] != predict(bestmod_4_svc, newdata = dat[train, ]))
[1] 0.1125
svc_error_train <- mean(summary(bestmod_4_svc)$fitted != y[train])
error_train <- c(error_train, "svc" = svc_error_train)
svc_error_train
[1] 0.1125

Test Error

ypred <- predict(bestmod_4_svc, dat[test, ])
table(predict = ypred, truth = y[test])
       truth
predict -1 1
     -1  8 0
     1   3 9
svc_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svc" = svc_error_test)
svc_error_test
[1] 0.15

Support Vector Machine - Polynomial Kernel (\(d\) = 2)

Train a support vector machine (polynomial kernel, \(d\) = 2) on a range of values for cost:

set.seed(1)
tune.out_4_svm <- tune(svm, y ~ ., data = dat[train, ], kernel = "polynomial", degree = 2, ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out_4_svm)

Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:

- best performance: 0.2875 

- Detailed performance results:
NA

Extract the best model and print summary:

bestmod_4_svm <- tune.out_4_svm$best.model
summary(bestmod_4_svm)

Call:
best.tune(method = svm, train.x = y ~ ., data = dat[train, ], ranges = list(cost = c(0.001, 0.01, 0.1, 1, 
    5, 10, 100)), kernel = "polynomial", degree = 2)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  polynomial 
       cost:  5 
     degree:  2 
     coef.0:  0 

Number of Support Vectors:  63

 ( 32 31 )


Number of Classes:  2 

Levels: 
 -1 1

Plot the decision boundary:

#plot(bestmod_4_svm, dat[train, ])
svm_plot(bestmod_4_svm, dat[train, ])

The degree-2 polynomial kernel captures some of the non-linearity of the decision boundary, but there are many errors.

Training error

#mean(dat[train, "y"] != predict(bestmod_4_svm, newdata = dat[train, ]))
svm_error_train <- mean(summary(bestmod_4_svm)$fitted != y[train])
error_train <- c(error_train, "svm (d = 2)" = svm_error_train)
svm_error_train
[1] 0.275

Test Error

ypred <- predict(bestmod_4_svm, dat[test, ])
table(predict = ypred, truth = dat[test, "y"])
       truth
predict -1 1
     -1  6 2
     1   5 7
svm_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (d = 2)" = svm_error_test)
svm_error_test
[1] 0.35

Support Vector Machine - Polynomial Kernel (\(d\) = 3)

Train a support vector machine (polynomial kernel, \(d\) = 3) on a range of values for cost:

set.seed(1)
tune.out_4_svm_2 <- tune(svm, y ~ ., data = dat[train, ], kernel = "polynomial", degree = 3, ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out_4_svm_2)

Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:

- best performance: 0.3875 

- Detailed performance results:
NA

Extract the best model and print summary:

bestmod_4_svm_2 <- tune.out_4_svm_2$best.model
summary(bestmod_4_svm_2)

Call:
best.tune(method = svm, train.x = y ~ ., data = dat[train, ], ranges = list(cost = c(0.001, 0.01, 0.1, 1, 
    5, 10, 100)), kernel = "polynomial", degree = 3)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  polynomial 
       cost:  1 
     degree:  3 
     coef.0:  0 

Number of Support Vectors:  75

 ( 38 37 )


Number of Classes:  2 

Levels: 
 -1 1

Plot the decision boundary:

#plot(bestmod_4_svm_2, dat[train, ])
svm_plot(bestmod_4_svm_2, dat[train, ])

The degree-3 polynomial kernel captures some of the non-linearity of the decision boundary, but there are many errors. It seems that a polynomial of degree 3 is worse than one of degree 2 here.

Training error

mean(dat[train, "y"] != predict(bestmod_4_svm_2, newdata = dat[train, ]))
[1] 0.4125
svm_2_error_train <- mean(summary(bestmod_4_svm_2)$fitted != y[train])
error_train <- c(error_train, "svm (d = 3)" = svm_2_error_train)
svm_2_error_train
[1] 0.4125

Test Error

ypred <- predict(bestmod_4_svm_2, dat[test, ])
table(predict = ypred, truth = dat[test, "y"])
       truth
predict -1  1
     -1  1  1
     1  10  8
svm_2_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (d = 3)" = svm_2_error_test)
svm_2_error_test
[1] 0.55

Support Vector Machine - Radial Kernel

Train a support vector machine with a radial kernel on a range of values for cost and gamma:

set.seed(1)
tune.out_4_svm_rad <- tune(svm, y ~ ., data = dat[train, ], kernel = "radial", ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100), gamma = c(0.5, 1, 2, 3, 4)))
summary(tune.out_4_svm_rad)

Parameter tuning of ‘svm’:

- sampling method: 10-fold cross validation 

- best parameters:

- best performance: 0.025 

- Detailed performance results:
NA

Extract the best model and print summary:

bestmod_4_svm_rad <- tune.out_4_svm_rad$best.model
summary(bestmod_4_svm_rad)

Call:
best.tune(method = svm, train.x = y ~ ., data = dat[train, ], ranges = list(cost = c(0.001, 0.01, 0.1, 1, 
    5, 10, 100), gamma = c(0.5, 1, 2, 3, 4)), kernel = "radial")


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  5 

Number of Support Vectors:  17

 ( 8 9 )


Number of Classes:  2 

Levels: 
 -1 1

Plot the decision boundary:

#plot(bestmod_4_svm_rad, dat[train, ])
svm_plot(bestmod_4_svm_rad, dat[train, ])

The radial kernel has very accurately captured the decision boundary. In fact there are no errors.

Training error

#mean(dat[train, "y"] != predict(bestmod_4_svm_rad, newdata = dat[train, ]))
svm_rad_error_train <- mean(summary(bestmod_4_svm_rad)$fitted != y[train])
error_train <- c(error_train, "svm (radial)" = svm_rad_error_train)
svm_rad_error_train
[1] 0

Test Error

ypred <- predict(bestmod_4_svm_rad, dat[test, ])
table(predict = ypred, truth = dat[test, "y"])
       truth
predict -1  1
     -1 11  1
     1   0  8
svm_rad_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (radial)" = svm_rad_error_test)
svm_rad_error_test
[1] 0.05

Summary of Errors

sort(error_train)
svm (radial)          svc  svm (d = 2)  svm (d = 3) 
      0.0000       0.1125       0.2750       0.4125 
sort(error_test)
svm (radial)          svc  svm (d = 2)  svm (d = 3) 
        0.05         0.15         0.35         0.55 

The SVM with radial kernel has the best training and test errors. However, the linear support vector classifier outperformed both of the SVMs with polynomial kernels.

Redo the analysis with a more complex non-linear decision boundary to see if the polynomial kernels outperform the linear kernel:

# Prelims:
set.seed(1)

# Data: same quadratic backbone, but class 1 is split into two clusters
# (one shifted up, one shifted far down) for a more complex boundary.
x1 <- rnorm(100)
x2 <- 4 * x1^2 + 1 + rnorm(100)

class <- sample(100, 50)
# BUG FIX: the original indexed class[1:25] and class[25:50], which overlap
# at position 25 — that observation was shifted by +3 and then by -10.
# Use disjoint halves instead.
x2[class[1:25]] <- x2[class[1:25]] + 3
x2[class[26:50]] <- x2[class[26:50]] - 10
x2[-class] <- x2[-class] - 3

y <- rep(-1, 100)
y[class] <- 1

dat <- data.frame(x1 = x1,
                  x2 = x2,
                  y = as.factor(y))

# Errors:
error_train <- c()
error_test <- c()

# Train/Test
train <- sample(100, 80)
test <- -train # `<-` instead of `=` for assignment

# Plots:
ggplot(dat) +
  geom_point(aes(x1, x2, group = y, color = y)) +
  theme_bw()

Support Vector Classifier

set.seed(1)
tune.out_4_svc <- tune(svm, y ~ ., data = dat[train, ], kernel = "linear", ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
bestmod_4_svc <- tune.out_4_svc$best.model
#plot(bestmod_4_svc, dat[train, ])
svm_plot(bestmod_4_svc, dat[train, ])

The linear classifier fails to capture any of the non-linear decision boundary.

Training error

svc_error_train <- mean(summary(bestmod_4_svc)$fitted != y[train])
error_train <- c(error_train, "svc" = svc_error_train)

Test Error

ypred <- predict(bestmod_4_svc, dat[test, ])
table(predict = ypred, truth = y[test])
       truth
predict -1  1
     -1  0  0
     1  11  9
svc_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svc" = svc_error_test)

Support Vector Machine - Polynomial Kernel (\(d\) = 2)

set.seed(1)
tune.out_4_svm <- tune(svm, y ~ ., data = dat[train, ], kernel = "polynomial", degree = 2, ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
bestmod_4_svm <- tune.out_4_svm$best.model
#plot(bestmod_4_svm, dat[train, ])
svm_plot(bestmod_4_svm, dat[train, ])

The degree-2 polynomial kernel captures some of the non-linearity of the decision boundary, but there are many errors.

Training error

#mean(dat[train, "y"] != predict(bestmod_4_svm, newdata = dat[train, ]))
svm_error_train <- mean(summary(bestmod_4_svm)$fitted != y[train])
error_train <- c(error_train, "svm (d = 2)" = svm_error_train)

Test Error

ypred <- predict(bestmod_4_svm, dat[test, ])
table(predict = ypred, truth = dat[test, "y"])
       truth
predict -1  1
     -1 10  4
     1   1  5
svm_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (d = 2)" = svm_error_test)

Support Vector Machine (Polynomial Kernel, \(d\) = 3)

set.seed(1)
tune.out_4_svm_2 <- tune(svm, y ~ ., data = dat[train, ], kernel = "polynomial", degree = 3, ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
bestmod_4_svm_2 <- tune.out_4_svm_2$best.model
#plot(bestmod_4_svm_2, dat[train, ])
svm_plot(bestmod_4_svm_2, dat[train, ])

The degree-3 polynomial kernel captures some of the non-linearity of the decision boundary, but there are many errors.

Training error

svm_2_error_train <- mean(summary(bestmod_4_svm_2)$fitted != y[train])
error_train <- c(error_train, "svm (d = 3)" = svm_2_error_train)

Test Error

ypred <- predict(bestmod_4_svm_2, dat[test, ])
svm_2_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (d = 3)" = svm_2_error_test)

Support Vector Machine - Radial Kernel

set.seed(1)
tune.out_4_svm_rad <- tune(svm, y ~ ., data = dat[train, ], kernel = "radial", ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100), gamma = c(0.5, 1, 2, 3, 4)))
bestmod_4_svm_rad <- tune.out_4_svm_rad$best.model
#plot(bestmod_4_svm_rad, dat[train, ])
svm_plot(bestmod_4_svm_rad, dat[train, ])

Training error

svm_rad_error_train <- mean(summary(bestmod_4_svm_rad)$fitted != y[train])
error_train <- c(error_train, "svm (radial)" = svm_rad_error_train)

Test Error

ypred <- predict(bestmod_4_svm_rad, dat[test, ])
table(predict = ypred, truth = dat[test, "y"])
       truth
predict -1  1
     -1 11  1
     1   0  8
svm_rad_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (radial)" = svm_rad_error_test)

Summary of Errors

sort(error_train)
svm (radial)  svm (d = 2)  svm (d = 3)          svc 
      0.0125       0.2750       0.3500       0.4875 
sort(error_test)
svm (radial)  svm (d = 2)  svm (d = 3)          svc 
        0.05         0.25         0.40         0.55 

The SVM with radial kernel has the best training and test errors. This time, with a more complex decision boundary, both of the SVMs with polynomial kernels outperformed the linear support vector classifier.


Exercise 5

  (a)

Generate a data set with \(n = 500\) and \(p = 2\), such that the observations belong to two classes with a quadratic decision boundary between them:

# Simulate n = 500 points with two uniform features on [-0.5, 0.5] and a
# quadratic decision boundary: class 1 exactly when x1^2 > x2^2.
set.seed(1)
x1 <- runif(500) - 0.5
x2 <- runif(500) - 0.5
y <- as.numeric(x1^2 - x2^2 > 0)
df <- data.frame(x1, x2, y = as.factor(y))

  (b)

Plot the observations, colored according to their class labels:

g0 <- ggplot(df) +
  geom_point(aes(x1, x2, group = y, color = y)) +
  theme_bw() +
  ggtitle("True Class Labels") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_discrete(name = "True Class")
g0

  (c)

Use all of the data as the training data

Fit a logistic regression model to the data, using \(X_1\) and \(X_2\) as predictors:

# Logistic regression on the raw features; the induced decision boundary in
# (x1, x2) space is linear. Fitted probabilities are thresholded at 0.5.
logreg <- glm(y ~ x1 + x2, data = df, family = "binomial")
probs <- predict(logreg, newdata = df, type = "response")
preds <- as.numeric(probs > 0.5) # unnamed 0/1 vector, same as the original prealloc-and-fill

  (d)

Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be linear:

df$logreg_preds <- as.factor(preds)
g1 <- ggplot(df) +
  geom_point(aes(x1, x2, group = logreg_preds, color = logreg_preds)) +
  theme_bw() +
  scale_color_discrete(name = "Pred. Class") + 
  ggtitle("Logistic Regression: y ~ x1 + x2") +
  theme(plot.title = element_text(hjust = 0.5))
cowplot::plot_grid(g0, g1, nrow = 1)

  (e)

Now fit a logistic regression model to the data using non-linear functions of \(X_1\) and \(X_2\) as predictors:

# Candidate logistic-regression formulas with non-linear transforms of the
# predictors. NOTE(review): x1 and x2 take negative values (runif - 0.5), so
# the ^0.5 terms produce NaN; predictions for those rows come back NA —
# confirm the sqrt models are intended.
formulas <- c("y ~ I(x1^2) + I(x2^2)",
              "y ~ I(x1^0.5) + I(x2^0.5)",
              "y ~ x1 + x2 + I(x1^2) + I(x2^2)",
              "y ~ x1 + x2 + I(x1^0.5) + I(x2^0.5)",
              "y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2)",
              "y ~ x1 + x2 + I(x1^0.5) + I(x2^0.5) + I(x1 * x2)"
)

# Fit each model, get fitted probabilities, threshold at 0.5.
# FIX: the original preallocated `preds` via a convoluted
# map(..., function(x) preds[[x]] = ...) call and then filled it with
# logical-index assignment, which breaks when the mask contains NA. A single
# map() over `probs` is equivalent for non-NA probabilities and propagates
# NA instead of erroring.
logregs <- map(formulas, ~glm(as.formula(.x), data = df, family = "binomial"))
probs <- map(logregs, ~predict(.x, newdata = df, type = "response"))
preds <- map(probs, ~as.numeric(.x > 0.5))

  (f)

With the models involving the squares of the predictors the predicted results look quite close to the true values. The models involving the square roots do not predict the classes well:

# One predicted-class scatter plot per formula, collected for plot_grid().
# NOTE(review): seq_along(formulas) is the safer idiom than 1:length(...).
G2 <- vector("list", length(formulas))
for(i in 1:length(formulas)) {
  df_ <- data.frame(x1, x2, preds = as.factor(preds[[i]]))
  G2[[i]] <- ggplot(df_) +
                  geom_point(aes(x1, x2,
                                 group = preds,
                                 color = preds)) +
                  ggtitle(label = paste0(formulas[i])) + # formula string as panel title
                  theme_bw() +
                  theme(plot.title = element_text(hjust = 0.5)) +
    scale_color_discrete(name = "Pred. Class")
}

cowplot::plot_grid(plotlist = G2, nrow = 3)

  (g)

Fit a support vector classifier to the data with \(X_1\) and \(X_2\) as predictors:

# (g) Linear-kernel SVC tuned over cost; plot its predicted classes.
# NOTE(review): unlike earlier chunks, no set.seed() precedes tune(), so the
# CV folds (and hence the selected cost) are not reproducible — confirm.
svc.out <- tune(svm, y ~ ., data = df, kernel = "linear", ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
svc <- svc.out$best.model
preds <- predict(svc, df)
df_ <- data.frame(x1, x2, preds = as.factor(preds))
g3 <- ggplot(df_) +
  geom_point(aes(x1, x2,
             group = preds,
             color = preds)) +
      ggtitle("Support Vector Classifier") +
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5)) +
      scale_color_discrete(name = "Pred. Class")
cowplot::plot_grid(g0, g3, nrow = 1)

  (h)

Fit a SVM using a non-linear kernel to the data.

Support vector machine with polynomial kernel, \(d\) = 2

# Degree-2 polynomial SVM tuned over cost; plot its predicted classes.
# NOTE(review): no set.seed() before tune(); CV folds not reproducible.
svm.out <- tune(svm, y ~ ., data = df, kernel = "polynomial", degree = 2, ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
svm1 <- svm.out$best.model
preds <- predict(svm1, df)
df_ <- data.frame(x1, x2, preds = as.factor(preds))
g4 <- ggplot(df_) +
  geom_point(aes(x1, x2,
             group = preds,
             color = preds)) +
      ggtitle("Support Vector Machine - Polynomial Kernel (d = 2)") +
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5, size = 10)) +
      scale_color_discrete(name = "Pred. Class")
cowplot::plot_grid(g0, g4, nrow = 1)

Support vector machine with radial kernel

# Radial-kernel SVM tuned over cost and gamma; plot its predicted classes.
# BUG FIX: the original extracted `svm.out$best.model` — the polynomial-kernel
# model from the previous chunk — instead of `svm.out2$best.model`, so the
# "radial kernel" plot actually showed the polynomial fit. Also dropped the
# stray `degree = 2` argument, which this section does not need for a radial
# kernel.
svm.out2 <- tune(svm, y ~ ., data = df, kernel = "radial", ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100), gamma = c(0.5, 1, 2, 3, 4)))
svm_rad <- svm.out2$best.model
preds <- predict(svm_rad, df)
df_ <- data.frame(x1, x2, preds = as.factor(preds))
g5 <- ggplot(df_) +
  geom_point(aes(x1, x2,
             group = preds,
             color = preds)) +
      ggtitle("Support Vector Machine - Radial Kernel") +
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5, size = 12)) +
      scale_color_discrete(name = "Pred. Class")
cowplot::plot_grid(g0, g5, nrow = 1)

  (i)

The logistic regression model with linear combination of the variables and the support vector classifier both performed poorly. The logistic regression model with non-linear transformations of the variables, as well as support vector machines with polynomial and radial kernels all performed well.

g2 <- G2[[1]] + ggtitle("Logistic Regression: y ~ I(x1^2) + I(x2^2)")
cowplot::plot_grid(g0, g1, g2, g3, g4, g5, nrow = 3)


Exercise 6

It’s hard to figure out exactly what the best type of data is for this question. Most of the data sets tried result in zero test error as the cost increases. Using the below data set with a modest \(n\) of 100 seems to work for what the question asks, i.e. the test error gets to a minimum before rising again as the cost increases.

  (a)

# Prelim: reproducible data generation.
set.seed(1)

# Data: x1 is tightly bounded (+/- 0.3) while x2 has sd 1, so the boundary
# -0.2*x1 + x2 = 0 is nearly horizontal. y is defined exactly by that linear
# rule (no label noise), so the classes are linearly separable.
n <- 100
x1 <- runif(n, max = 0.3, min = -0.3)
x2 <- rnorm(n, sd = 1)
y <- ifelse(-0.2 * x1 + x2 > 0, 1, -1)
df <- data.frame(x1, x2, y = as.factor(y))

# Train/Test: 80/20 split.
train <- sample(n, 4*n/5)
test <- -train

# Plot:
ggplot(df) +
  geom_point(aes(x1, x2, group = y, color = y)) +
  theme_bw() +
  ggtitle("True Class Labels") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_discrete(name = "True Class", breaks = c(1, -1))

  (b) and (c)

Generate support vector classifiers via cross validation using a range of values for cost (use all of the data for CV, then we will use the training/test splits in the next section):

# Cost grid spanning eight orders of magnitude; tune() runs 10-fold CV on
# the full data set.
# NOTE(review): no set.seed() before tune() here, so the fold assignment
# (and these CV errors) are not reproducible — confirm whether intended.
costs <- c (0.001, 0.01, 0.1, 1, 5, 10, 100, 1e3, 1e4, 1e5)
svc.out <- tune(svm, y ~ ., data = df, kernel = "linear", ranges = list(cost = costs))
svc.cv.error <- svc.out$performances$error # extract the CV errors
svc.cv.error
 [1] 0.44 0.25 0.05 0.06 0.06 0.05 0.04 0.03 0.02 0.02

The training and test errors can be obtained by training SVC models using a range of values for cost and then making predictions on the training and test data respectively:

# Fit an SVC at each cost on the training split; record training and test
# error rates and collect one decision-boundary plot per cost.
# FIX: 1:length(costs) replaced with seq_along(costs) (safe for empty
# vectors); redundant train.error/test.error intermediates removed.
svc.train.error <- vector("double", length(costs))
svc.test.error <- vector("double", length(costs))
G <- vector("list", length(costs))
for(i in seq_along(costs)) {
  svc <- svm(y ~ ., data = df[train, ], kernel = "linear", cost = costs[i])
  G[[i]] <- svm_plot(svc, df[train, ]) + ggtitle(str_c("cost: ", costs[i]))
  
  # Training Error:
  preds <- predict(svc, df[train, ])
  svc.train.error[i] <- mean(preds != y[train])
  
  # Test Error:
  preds <- predict(svc, df[test, ])
  svc.test.error[i] <- mean(preds != y[test])
}

svc.train.error
 [1] 0.4375 0.2500 0.1000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
svc.test.error
 [1] 0.45 0.35 0.00 0.05 0.05 0.05 0.05 0.05 0.05 0.05
plot_grid(plotlist = G, nrow = 5)

Plot the CV, training, and test errors:

# NOTE: latex2exp is already attached in Setup; this call is redundant but harmless.
library(latex2exp)
svc.errors <- data.frame(cost = log10(costs),
                         cv.error = svc.cv.error,
                         train.error = svc.train.error,
                         test.error = svc.test.error)

# see https://stackoverflow.com/a/50496899 and https://stackoverflow.com/a/10355844 for making legend
# One geom_line per error type; the manual color scale doubles as the legend.
ggplot(svc.errors) +
  geom_line(aes(cost, cv.error, color = "CV")) +
  geom_line(aes(cost, train.error, color = "Train")) +
  geom_line(aes(cost, test.error, color = "Test")) +
  geom_hline(aes(yintercept = min(svc.test.error)), color = "grey30", linetype = "dashed") + # lowest test error
  scale_color_manual(name = "Error Rate", breaks = c("Test", "Train", "CV"), values = c("Test" = "#00BA38", "CV" = "#F8766D", "Train" = "#619CFF")) +
  #scale_color_discrete(name = NULL, labels = c("CV Error", "Test Error", "Training Error")) +
  labs(title = "Comparison of CV, Training and Test Error Rates",
       x = TeX("Log_{10}(Cost)"),
       y = "Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = c(0.97, 0.95), legend.justification = c(1, 1))

  (d)

The error rates approach zero as the cost increases from small values towards 1. Then, the test error starts to rise again, suggesting that a lower value for cost is better, even though the training error is zero for larger values.


Exercise 7

View data:

Auto

Summary:

summary(Auto)
      mpg          cylinders      displacement     horsepower        weight      acceleration        year           origin     
 Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0   Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
 1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0   1st Qu.:2225   1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000  
 Median :22.75   Median :4.000   Median :151.0   Median : 93.5   Median :2804   Median :15.50   Median :76.00   Median :1.000  
 Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5   Mean   :2978   Mean   :15.54   Mean   :75.98   Mean   :1.577  
 3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0   3rd Qu.:3615   3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000  
 Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0   Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
                                                                                                                               
                 name    
 amc matador       :  5  
 ford pinto        :  5  
 toyota corolla    :  5  
 amc gremlin       :  4  
 amc hornet        :  4  
 chevrolet chevette:  4  
 (Other)           :365  
Auto$origin <- as.factor(Auto$origin) # origin is a 3-level code, not a quantity
Auto$year <- factor(Auto$year, ordered = TRUE) # model year as an ordered factor
head(Auto)

  (a)

Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median:

# Binary response: TRUE when mpg is above the median. mpg itself (column 1)
# is then dropped so the classifiers cannot use the response's source variable.
Auto$med01 <- as.factor(Auto$mpg > median(Auto$mpg))
Auto <- Auto[ , -1]
summary(Auto$med01)
FALSE  TRUE 
  196   196 

  (b)

Fit a support vector classifier to the data with various values of cost, in order to predict whether a car gets high or low gas mileage:

# Linear-kernel SVC over a wide cost grid; 10-fold CV errors via tune().
set.seed(1)
costs <- c (0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 250, 500, 1e3, 5e3, 1e4, 5e4, 1e5)
svc.out <- tune(svm, med01 ~ ., data = Auto, kernel = "linear", ranges = list(cost = costs))
svc.cv.error <- svc.out$performances$error # extract the CV errors
svc.cv.error
 [1] 0.18141026 0.10211538 0.09955128 0.09192308 0.09442308 0.09955128 0.14032051 0.14288462 0.15570513 0.15570513 0.15570513
[12] 0.15570513 0.15570513 0.15570513 0.15570513

Plot CV errors:

ggplot(data.frame(cost = log10(costs), cv.error = svc.cv.error)) +
  geom_line(aes(cost, cv.error), color = ISLR_orange) +
  labs(title = "CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

  (c)

Now repeat (b), this time using SVMs with radial and polynomial basis kernels, with different values of gamma and degree and cost.

Polynomial Kernel

# Polynomial kernel: joint grid over degree and cost. The first variable in
# `ranges` varies fastest, so rows of the reshaped matrix below are degrees.
set.seed(1)
costs <- c (0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 250, 500, 1e3, 5e3, 1e4, 5e4, 1e5)
degrees <- c(2, 3, 4, 5)
svc.out.poly <- tune(svm, med01 ~ ., data = Auto, kernel = "polynomial", ranges = list(degree = degrees, cost = costs)) # first variable in ranges is the rows in below matrix
svc.poly.cv.error <- svc.out.poly$performances$error # extract the CV errors
round(matrix(svc.poly.cv.error, nrow = length(degrees)), 3)
      [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12] [,13] [,14] [,15]
[1,] 0.551 0.551 0.551 0.551 0.551 0.549 0.352 0.281 0.250 0.228 0.135 0.097 0.082 0.089 0.089
[2,] 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.454 0.368 0.312 0.276 0.255 0.258 0.123 0.120
[3,] 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.498 0.470 0.383 0.350
[4,] 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.551 0.503 0.493

Plot CV errors:

# Reshape the degree x cost error matrix to long format for ggplot.
# NOTE(review): gather() is superseded; pivot_longer() is the modern equivalent.
dat_ <- data.frame(cbind(cost = log10(costs), t(matrix(svc.poly.cv.error, nrow = length(degrees))))) %>% 
        gather(key = "degree", value = "cv.error", V2:V5) %>% 
        mutate(degree = factor(str_replace(degree, "V", "degree: "))) # V2..V5 -> "degree: 2".."degree: 5"

ggplot(dat_) +
  geom_line(aes(cost, cv.error, color = degree)) +
  labs(title = "Polynomial Kernel - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  ggsci::scale_color_d3() +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

Radial Kernel

# Radial kernel: joint grid over gamma and cost.
# BUG FIX: this section tunes the *radial* kernel, but the original passed
# kernel = "polynomial" (a copy-paste slip from the previous chunk), so the
# gamma grid was applied to a polynomial kernel and the printed "radial"
# results below are actually polynomial-kernel results. Also `gammas =` is
# replaced with `<-` for assignment.
set.seed(1)
costs <- c(0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 250, 500, 1e3, 5e3, 1e4, 5e4, 1e5)
gammas <- c(0.01, 0.05, 0.1, 0.5)
svc.out.rad <- tune(svm, med01 ~ ., data = Auto, kernel = "radial", ranges = list(gamma = gammas, cost = costs)) # first variable in ranges is the rows in below matrix
svc.rad.cv.error <- svc.out.rad$performances$error # extract the CV errors
round(matrix(svc.rad.cv.error, nrow = length(gammas)), 3)
      [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8]  [,9] [,10] [,11] [,12] [,13] [,14] [,15]
[1,] 0.551 0.551 0.551 0.551 0.408 0.350 0.260 0.261 0.258 0.240 0.192 0.120 0.097 0.084 0.082
[2,] 0.551 0.551 0.337 0.255 0.230 0.143 0.110 0.097 0.084 0.084 0.079 0.079 0.079 0.079 0.079
[3,] 0.551 0.350 0.261 0.192 0.120 0.097 0.084 0.082 0.079 0.079 0.079 0.079 0.079 0.079 0.079
[4,] 0.255 0.143 0.097 0.079 0.079 0.079 0.079 0.079 0.079 0.079 0.079 0.079 0.079 0.079 0.079

Plot CV errors:

# Reshape the gamma x cost error matrix to long format. Column names V2..V5
# map back to gammas[1..4] via the index arithmetic below (V2 -> gammas[1]).
# NOTE(review): gather() is superseded; pivot_longer() is the modern equivalent.
dat_ <- data.frame(cbind(cost = log10(costs), t(matrix(svc.rad.cv.error, nrow = length(gammas))))) %>% 
        gather(key = "gamma", value = "cv.error", V2:V5) %>% 
        mutate(gamma = str_c("gamma: ", factor(gammas[as.numeric(str_remove(gamma, "V")) - 1]))) # factor to make it a discrete var in ggplot()

ggplot(dat_) +
  geom_line(aes(cost, cv.error, color = gamma)) +
  labs(title = "Radial Kernel - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  ggsci::scale_color_d3() +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

  (d)

A support vector machine with a linear kernel and cost = 1 performs well, with a CV error of about 0.092 (the best value in the linear-kernel results above).

A support vector machine with a polynomial kernel of degree two also achieves a low CV error of about 0.082. However, the cost required is very large, at 10,000.

A support vector machine with a radial kernel and parameters of cost = 1 and gamma = 0.5 has the lowest CV error, at about 0.079. Other radial-kernel SVMs with very large cost values achieve an equally low CV error, but a gamma value of 0.5 performs well at a much lower cost, which should overfit less than the other models.


Exercise 8

View data:

head(OJ)

Summary:

summary(OJ)
 Purchase WeekofPurchase     StoreID        PriceCH         PriceMM          DiscCH            DiscMM         SpecialCH     
 CH:653   Min.   :227.0   Min.   :1.00   Min.   :1.690   Min.   :1.690   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
 MM:417   1st Qu.:240.0   1st Qu.:2.00   1st Qu.:1.790   1st Qu.:1.990   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
          Median :257.0   Median :3.00   Median :1.860   Median :2.090   Median :0.00000   Median :0.0000   Median :0.0000  
          Mean   :254.4   Mean   :3.96   Mean   :1.867   Mean   :2.085   Mean   :0.05186   Mean   :0.1234   Mean   :0.1477  
          3rd Qu.:268.0   3rd Qu.:7.00   3rd Qu.:1.990   3rd Qu.:2.180   3rd Qu.:0.00000   3rd Qu.:0.2300   3rd Qu.:0.0000  
          Max.   :278.0   Max.   :7.00   Max.   :2.090   Max.   :2.290   Max.   :0.50000   Max.   :0.8000   Max.   :1.0000  
   SpecialMM         LoyalCH          SalePriceMM     SalePriceCH      PriceDiff       Store7      PctDiscMM     
 Min.   :0.0000   Min.   :0.000011   Min.   :1.190   Min.   :1.390   Min.   :-0.6700   No :714   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.325257   1st Qu.:1.690   1st Qu.:1.750   1st Qu.: 0.0000   Yes:356   1st Qu.:0.0000  
 Median :0.0000   Median :0.600000   Median :2.090   Median :1.860   Median : 0.2300             Median :0.0000  
 Mean   :0.1617   Mean   :0.565782   Mean   :1.962   Mean   :1.816   Mean   : 0.1465             Mean   :0.0593  
 3rd Qu.:0.0000   3rd Qu.:0.850873   3rd Qu.:2.130   3rd Qu.:1.890   3rd Qu.: 0.3200             3rd Qu.:0.1127  
 Max.   :1.0000   Max.   :0.999947   Max.   :2.290   Max.   :2.090   Max.   : 0.6400             Max.   :0.4020  
   PctDiscCH       ListPriceDiff       STORE      
 Min.   :0.00000   Min.   :0.000   Min.   :0.000  
 1st Qu.:0.00000   1st Qu.:0.140   1st Qu.:0.000  
 Median :0.00000   Median :0.240   Median :2.000  
 Mean   :0.02731   Mean   :0.218   Mean   :1.631  
 3rd Qu.:0.00000   3rd Qu.:0.300   3rd Qu.:3.000  
 Max.   :0.25269   Max.   :0.440   Max.   :4.000  

  (a)

Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations:

# Prelims:
set.seed(1)

# Data:
df <- OJ

# Train/Test:
train <- sample(1:nrow(OJ), 800)
test <- -train

  (b)

Fit a support vector classifier to the training data using cost = 0.01, with Purchase as the response and the other variables as predictors:

svc_8 <- svm(Purchase ~ . , data = df[train, ], kernel = "linear", cost = 0.01)
summary(svc_8)

Call:
svm(formula = Purchase ~ ., data = df[train, ], kernel = "linear", cost = 0.01)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  linear 
       cost:  0.01 

Number of Support Vectors:  435

 ( 219 216 )


Number of Classes:  2 

Levels: 
 CH MM

  (c)

Training error:

svc_8.train.error <- mean(predict(svc_8, df[train, ]) != df$Purchase[train])
svc_8.train.error
[1] 0.175

Test error:

svc_8.test.error <- mean(predict(svc_8, df[test, ]) != df$Purchase[test])
svc_8.test.error
[1] 0.1777778

  (d)

Use the tune() function to select an optimal cost. Consider values in the range 0.01 to 10.

set.seed(1)
costs <- c (0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10)
svc.out_8 <- tune(svm, Purchase ~ ., data = df[train, ], kernel = "linear", ranges = list(cost = costs))
svc_8.cv.error <- svc.out_8$performances$error # extract the CV errors
svc_8.cv.error
[1] 0.17625 0.17625 0.17250 0.17125 0.16875 0.17500 0.17250 0.17250 0.17375

Plot:

ggplot(data.frame(cost = log10(costs), cv.error = svc_8.cv.error)) +
  geom_line(aes(cost, cv.error), color = ISLR_orange) +
  labs(title = "Linear Kernel - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

  (e)

The training and test errors can be obtained by training SVC models using a range of values for cost and then making predictions on the training and test data respectively:

set.seed(1)
svc_8.train.error <- vector("double", length(costs))
svc_8.test.error <- vector("double", length(costs))
for(i in 1:length(costs)) {
  svc <- svm(Purchase ~ ., data = df[train, ], kernel = "linear", cost = costs[i])
  
  # Training Error:
  preds <- predict(svc, df[train, ])
  train.error <- mean(preds != df$Purchase[train])
  svc_8.train.error[i] <- train.error
  
  # Test Error:
  preds <- predict(svc, df[test, ])
  test.error <- mean(preds != df$Purchase[test])
  svc_8.test.error[i] <- test.error
}

svc_8.train.error
[1] 0.17500 0.17000 0.16500 0.16500 0.16500 0.16375 0.16500 0.16625 0.16375
svc_8.test.error
[1] 0.1777778 0.1629630 0.1629630 0.1592593 0.1555556 0.1555556 0.1518519 0.1555556 0.1481481

Plot the CV, training, and test errors:

library(latex2exp)
svc_8.errors <- data.frame(cost = log10(costs),
                         cv.error = svc_8.cv.error,
                         train.error = svc_8.train.error,
                         test.error = svc_8.test.error)

# see https://stackoverflow.com/a/50496899 and https://stackoverflow.com/a/10355844 for making legend
g1 <- ggplot(svc_8.errors) +
  geom_line(aes(cost, cv.error, color = "CV")) +
  geom_line(aes(cost, train.error, color = "Train")) +
  geom_line(aes(cost, test.error, color = "Test")) +
  scale_color_manual(name = "Error Rate", breaks = c("Test", "Train", "CV"), values = c("Test" = "#00BA38", "CV" = "#F8766D", "Train" = "#619CFF")) +
  #scale_color_discrete(name = NULL, labels = c("CV Error", "Test Error", "Training Error")) +
  labs(title = "Linear Kernel - Comparison of CV, Training and Test Error Rates",
       x = TeX("Log_{10}(Cost)"),
       y = "Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = c(0.97, 0.95), legend.justification = c(1, 1))
g1

The training error seems to be higher than the test error.

  (f)

Radial Kernel

set.seed(1)
svm_8_rad <- svm(Purchase ~ . , data = df[train, ], kernel = "radial", cost = 0.01)
summary(svm_8_rad)

Call:
svm(formula = Purchase ~ ., data = df[train, ], kernel = "radial", cost = 0.01)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  0.01 

Number of Support Vectors:  634

 ( 319 315 )


Number of Classes:  2 

Levels: 
 CH MM

Training error:

svm_8_rad.train.error <- mean(predict(svm_8_rad, df[train, ]) != df$Purchase[train])
svm_8_rad.train.error
[1] 0.39375

Test error:

svm_8_rad.test.error <- mean(predict(svm_8_rad, df[test, ]) != df$Purchase[test])
svm_8_rad.test.error
[1] 0.3777778

Use CV:

set.seed(1)
svm.out_8_rad <- tune(svm, Purchase ~ ., data = df[train, ], kernel = "radial", ranges = list(cost = costs))
svm_8_rad.cv.error <- svm.out_8_rad$performances$error # extract the CV errors
svm_8_rad.cv.error
[1] 0.39375 0.20250 0.18625 0.18000 0.16750 0.17125 0.17750 0.18000 0.18625

Plot CV error:

ggplot(data.frame(cost = log10(costs), cv.error = svm_8_rad.cv.error)) +
  geom_line(aes(cost, cv.error), color = ISLR_orange) +
  labs(title = "Radial Kernel - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

The training and test errors can be obtained by training SVM models using a range of values for cost and then making predictions on the training and test data respectively:

set.seed(1)
svm_8_rad.train.error <- vector("double", length(costs))
svm_8_rad.test.error <- vector("double", length(costs))
for(i in 1:length(costs)) {
  svm_8 <- svm(Purchase ~ ., data = df[train, ], kernel = "radial", cost = costs[i])
  
  # Training Error:
  preds <- predict(svm_8, df[train, ])
  train.error <- mean(preds != df$Purchase[train])
  svm_8_rad.train.error[i] <- train.error
  
  # Test Error:
  preds <- predict(svm_8, df[test, ])
  test.error <- mean(preds != df$Purchase[test])
  svm_8_rad.test.error[i] <- test.error
}

svm_8_rad.train.error
[1] 0.39375 0.18250 0.17375 0.16000 0.14750 0.15125 0.14750 0.14625 0.14500
svm_8_rad.test.error
[1] 0.3777778 0.2333333 0.2037037 0.1962963 0.1777778 0.1851852 0.1814815 0.1777778 0.1851852

Plot the CV, training, and test errors:

library(latex2exp)
svm_8_rad.errors <- data.frame(cost = log10(costs),
                         cv.error = svm_8_rad.cv.error,
                         train.error = svm_8_rad.train.error,
                         test.error = svm_8_rad.test.error)

# see https://stackoverflow.com/a/50496899 and https://stackoverflow.com/a/10355844 for making legend
g2 <- ggplot(svm_8_rad.errors) +
  geom_line(aes(cost, cv.error, color = "CV")) +
  geom_line(aes(cost, train.error, color = "Train")) +
  geom_line(aes(cost, test.error, color = "Test")) +
  scale_color_manual(name = "Error Rate", breaks = c("Test", "Train", "CV"), values = c("Test" = "#00BA38", "CV" = "#F8766D", "Train" = "#619CFF")) +
  #scale_color_discrete(name = NULL, labels = c("CV Error", "Test Error", "Training Error")) +
  labs(title = "Radial Kernel - Comparison of CV, Training and Test Error Rates",
       x = TeX("Log_{10}(Cost)"),
       y = "Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = c(0.97, 0.95), legend.justification = c(1, 1))
g2

With the radial kernel, the training error is lower than the test error.

  (g)

Polynomial Kernel

set.seed(1)
svm_8_poly <- svm(Purchase ~ . , data = df[train, ], kernel = "polynomial", degree = 2, cost = 0.01)
summary(svm_8_poly)

Call:
svm(formula = Purchase ~ ., data = df[train, ], kernel = "polynomial", degree = 2, cost = 0.01)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  polynomial 
       cost:  0.01 
     degree:  2 
     coef.0:  0 

Number of Support Vectors:  636

 ( 321 315 )


Number of Classes:  2 

Levels: 
 CH MM

Training error:

svm_8_poly.train.error <- mean(predict(svm_8_poly, df[train, ]) != df$Purchase[train])
svm_8_poly.train.error
[1] 0.3725

Test error:

svm_8_poly.test.error <- mean(predict(svm_8_poly, df[test, ]) != df$Purchase[test])
svm_8_poly.test.error
[1] 0.3666667

Use CV:

set.seed(1)
svm.out_8_poly <- tune(svm, Purchase ~ ., data = df[train, ], kernel = "polynomial", degree = 2, ranges = list(cost = costs))
svm_8_poly.cv.error <- svm.out_8_poly$performances$error # extract the CV errors (used in the plots below)

Plot CV error:

ggplot(data.frame(cost = log10(costs), cv.error = svm_8_poly.cv.error)) +
  geom_line(aes(cost, cv.error), color = ISLR_orange) +
  labs(title = "Polynomial Kernel (d = 2) - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

The training and test errors can be obtained by training SVC models using a range of values for cost and then making predictions on the training and test data respectively:

set.seed(1)
svm_8_poly.train.error <- vector("double", length(costs))
svm_8_poly.test.error <- vector("double", length(costs))
for(i in 1:length(costs)) {
  svm_8 <- svm(Purchase ~ ., data = df[train, ], kernel = "polynomial", degree = 2, cost = costs[i])
  
  # Training Error:
  preds <- predict(svm_8, df[train, ])
  train.error <- mean(preds != df$Purchase[train])
  svm_8_poly.train.error[i] <- train.error
  
  # Test Error:
  preds <- predict(svm_8, df[test, ])
  test.error <- mean(preds != df$Purchase[test])
  svm_8_poly.test.error[i] <- test.error
}

svm_8_poly.train.error
[1] 0.37250 0.33125 0.30625 0.19125 0.18750 0.18250 0.16000 0.15500 0.15000
svm_8_poly.test.error
[1] 0.3666667 0.3074074 0.2962963 0.2333333 0.2259259 0.2222222 0.2111111 0.1814815 0.1888889

Plot the CV, training, and test errors:

library(latex2exp)
svm_8_poly.errors <- data.frame(cost = log10(costs),
                         cv.error = svm_8_poly.cv.error,
                         train.error = svm_8_poly.train.error,
                         test.error = svm_8_poly.test.error)

# see https://stackoverflow.com/a/50496899 and https://stackoverflow.com/a/10355844 for making legend
g3 <- ggplot(svm_8_poly.errors) +
  geom_line(aes(cost, cv.error, color = "CV")) +
  geom_line(aes(cost, train.error, color = "Train")) +
  geom_line(aes(cost, test.error, color = "Test")) +
  scale_color_manual(name = "Error Rate", breaks = c("Test", "Train", "CV"), values = c("Test" = "#00BA38", "CV" = "#F8766D", "Train" = "#619CFF")) +
  #scale_color_discrete(name = NULL, labels = c("CV Error", "Test Error", "Training Error")) +
  labs(title = "Polynomial Kernel (d = 2) - Comparison of CV, Training and Test Error Rates",
       x = TeX("Log_{10}(Cost)"),
       y = "Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = c(0.97, 0.95), legend.justification = c(1, 1))
g3

With the polynomial kernel, the training error is lower than the test error.

  (h)

Comparison of three kernels:

plot_grid(plotlist = list(g1 + ggtitle("Linear") + scale_y_continuous(limits = c(0.14, 0.4), breaks = seq(0.1, 0.4, by = 0.05)),
                          g2 + ggtitle("Radial") + scale_y_continuous(limits = c(0.14, 0.4), breaks = seq(0.1, 0.4, by = 0.05)) + theme(axis.title.y = element_blank()),
                          g3 + ggtitle("Polynomial") + scale_y_continuous(limits = c(0.14, 0.4), breaks = seq(0.1, 0.4, by = 0.05)) + theme(axis.title.y = element_blank())), nrow = 1)


  1. James G, Witten D, Hastie T, Tibshirani R (2013). An Introduction to Statistical Learning: with Applications in R. Springer, 1st edition.↩︎

---
title: "An Introduction to Statistical Learning (2013)"
subtitle: "Solutions to Applied Exercises"
author: "by Brian Hassett"
output: 
  html_notebook: 
    toc: yes
    toc_depth: 2
---

# Chapter 9 Support Vector Machines

This notebook is a rough outline of solutions to the applied exercises in Chapter 9 of **An Introduction to Statistical Learning** (James et al. (2013)[^1]).

[^1]: James G, Witten D, Hastie T, Tibshirani R (2013). *An Introduction to Statistical Learning: with Applications in R*. Springer, 1st edition.

## Setup
```{r setup, message=FALSE}
library(ISLR)
library(e1071)
library(cowplot)
library(latex2exp)
library(tidyverse)
library(patchwork)
theme_set(theme_bw())

# Given a filename, return the full path to it inside the `data` folder of the
# current working directory, e.g. get_path("x.csv") -> "<getwd()>/data/x.csv".
# Uses base file.path() (default separator is "/") so it works even before
# the tidyverse is attached.
get_path <- function(filename) {
  file.path(getwd(), "data", filename)
}

# Load my functions:
source("/home/brian/R/dir/Util/bh_functions.R")

# Colors:
ISLR_blue <- "#176FC0"
ISLR_brown <- "#94340E"
ISLR_lightblue <- "#68A5DB"
ISLR_orange <- "#CC5318"

# Functions for plotting `svm` objects:
# Signed distance from the point `p` = (p1, p2) to the line
# coef[1] + coef[2]*p1 + coef[3]*p2 = 0, returned as an unnamed scalar.
# The sign indicates which side of the line `p` lies on.
dist2point <- function(coef, p) {
  numerator <- coef[1] + coef[2] * p[1] + coef[3] * p[2]
  unname(numerator / sqrt(coef[2]^2 + coef[3]^2))
}

# For a *linear* SVC fitted by e1071::svm(), compute the slope and intercepts
# of the separating line and the two margin lines, in the flipped coordinate
# system used by svm_plot() (feature 1 on the y-axis, feature 2 on the x-axis).
# Returns c(slope, hyperplane intercept, margin-1 intercept, margin-2 intercept).
# NOTE(review): int0 is taken as -coef[1] rather than -coef[1]/coef[2]; it is
# only consumed by a commented-out line in svm_plot(), but confirm before use.
# NOTE(review): svc$SV holds the support vectors as stored by e1071 (scaled by
# default when scale = TRUE); assumes they are on the same scale as the plotted
# data — confirm for models fit with scaling.
svc_margins <- function(svc) {
  slope <- -coef(svc)[3]/coef(svc)[2] # slope
  int0 <- -coef(svc)[1] # hyperplane intercept
  # The margin lines pass through the support vectors with the largest positive
  # and the largest negative signed distance from the separating hyperplane.
  v1 <- svc$SV[apply(svc$SV, 1, function(v) dist2point(coef(svc), v)) %>% which.max(), ] # support vector 1
  v2 <- svc$SV[apply(svc$SV, 1, function(v) dist2point(coef(svc), v)) %>% which.min(), ] # support vector 2

  int1 <- v1[1] - slope*v1[2] # intercept of margin 1 (line of given slope through v1)
  int2 <- v2[1] - slope*v2[2] # intercept of margin 2 (line of given slope through v2)

  c(slope, int0, int1, int2) %>% unname()
}

# Plot the decision surface of an e1071::svm model fit on a 2-feature data set.
# `df`: data frame whose first two columns are the features and whose last
# column is the class label (levels "1" / "-1"). Background tiles show the
# predicted class over a 100x100 grid, points show the observations (support
# vectors as crosses), and for a linear kernel the two margin lines are added.
# NOTE(review): the aes() calls below refer to columns `x1`/`x2` by name, so
# the first two columns of `df` are assumed to be named x1 and x2 (true for
# the data sets in this notebook) — confirm for other inputs.
svm_plot <- function(svm, df) {
  dat <- df
  x1 <- dat[ , 1]  # first feature (plotted on the y-axis)
  x2 <- dat[ , 2]  # second feature (plotted on the x-axis)
  y <- dat[ , ncol(dat)]  # class labels (last column)
  
  # 100 x 100 grid spanning the observed range of both features:
  grid <- expand.grid(seq(min(x1), max(x1), length.out = 100),
                      seq(min(x2), max(x2), length.out = 100)) 
  names(grid) <- names(dat)[1:2]  # match feature names so predict() can find them
  preds <- predict(svm, grid)
  dat_ <- data.frame(grid, preds)
  
  # Point colors, tile fills, and support-vector shapes keyed by class label:
  cols <- c('1' = '#f53224', '-1' = '#1a6eff') # 1 := red, -1 := blue; three shades darker on https://www.w3schools.com/colors/colors_picker.asp
  tiles <- c('1' = '#F8766D', '-1' = '#619CFF')
  shapes <- c('support' = 4, 'notsupport' = 1)
  dat$support <- 'notsupport'
  dat[svm$index, 'support'] <- 'support'  # svm$index marks the support vectors

  # Base layer: predicted-class tiles (semi-transparent so points stay visible).
  g <- ggplot(dat_, aes(x = x2, y = x1)) +
          geom_tile(aes(fill = preds), color = "white", alpha = 0.3) + 
          scale_fill_manual(name = "Pred. Class", values = tiles)
  
  if(svm$kernel == 0) { # if linear kernel add margins (e1071 encodes "linear" as 0)
    margins <- svc_margins(svm)
    g <- g + 
          #geom_abline(aes(slope = margins[1], intercept = margins[2]), col = "green") + # line of separation
          geom_abline(aes(slope = margins[1], intercept = margins[4]), col = cols[1]) + #  1 := red
          geom_abline(aes(slope = margins[1], intercept = margins[3]), col = cols[2]) # -1 := blue
  }

  # Top layer: the observations, colored by true class, crosses for support vectors.
  g <- g +
        geom_point(data = dat, aes(color = y, shape = support), size = 2, show.legend = FALSE) +
        scale_color_manual(values = cols) +
        scale_shape_manual(values = shapes) +
        scale_x_continuous(expand = c(0, 0)) +
        scale_y_continuous(expand = c(0, 0)) +
        guides(fill = guide_legend(override.aes = list(alpha = 1))) + # full-opacity legend swatches
        theme_bw() +
        ggtitle('SVM Classification Plot') +
        theme(plot.title = element_text(hjust = 0.5))
  g
}

# Return a character vector with the names of the numeric columns in a data
# frame. Uses base vapply() so the helper works even when purrr is not
# attached; behavior is identical to map_lgl(df, is.numeric).
numeric_cols <- function(df) {
  is_numeric_cols <- vapply(df, is.numeric, logical(1))
  return(names(df)[is_numeric_cols])
}
```

## Exercise 4

*Generate a simulated two-class data set with 100 observations and two features in which there is a visible but non-linear separation between the two classes. Show that in this setting, a support vector machine with a polynomial kernel (with degree greater than 1) or a radial kernel will outperform a support vector classifier on the training data. Which technique performs best on the test data? Make plots and report training and test error rates in order to back up your assertions.*

Generate data:
```{r}
# Prelims:
set.seed(1)

# Data: quadratic boundary x2 = 4*x1^2 + 1, with the two classes shifted
# vertically by +/-3 to create a visible non-linear separation.
x1 <- rnorm(100)
x2 <- 4 * x1^2 + 1 + rnorm(100)

class <- sample(100, 50)
x2[class] <- x2[class] + 3
x2[-class] <- x2[-class] - 3

y <- rep(-1, 100)
y[class] <- 1

dat <- data.frame(x1 = x1,
                  x2 = x2,
                  y = as.factor(y))

# Errors (accumulated per model in the sections below):
error_train <- c()
error_test <- c()

# Train/Test
train <- sample(100, 80)
test <- -train # `<-` for assignment, consistent with the rest of the file

# Plots:
ggplot(dat) +
  geom_point(aes(x1, x2, group = y, color = y)) +
  theme_bw()
```

### Support Vector Classifier  

Train a support vector classifier (linear kernel) on a range of values for `cost`:
```{r}
set.seed(1)
tune.out_4_svc <- tune(svm, y ~ ., data = dat[train, ], kernel = "linear", ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out_4_svc)
```

Extract the best model and print summary:
```{r}
bestmod_4_svc <- tune.out_4_svc$best.model
summary(bestmod_4_svc)
```

Plot the decision boundary:
```{r}
#plot(bestmod_4_svc, dat[train, ])
svm_plot(bestmod_4_svc, dat[train, ])
```

The linear classifier fails to accurately capture the non-linearity of the decision boundary.

#### Training error
```{r}
mean(dat[train, "y"] != predict(bestmod_4_svc, newdata = dat[train, ]))
svc_error_train <- mean(summary(bestmod_4_svc)$fitted != y[train])
error_train <- c(error_train, "svc" = svc_error_train)
svc_error_train
```

#### Test Error

```{r}
ypred <- predict(bestmod_4_svc, dat[test, ])
table(predict = ypred, truth = y[test])
```

```{r}
svc_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svc" = svc_error_test)
svc_error_test
```

### Support Vector Machine - Polynomial Kernel ($d$ = 2)

Train a support vector machine (polynomial kernel, $d$ = 2) on a range of values for `cost`:
```{r}
set.seed(1)
tune.out_4_svm <- tune(svm, y ~ ., data = dat[train, ], kernel = "polynomial", degree = 2, ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out_4_svm)
```

Extract the best model and print summary:
```{r}
bestmod_4_svm <- tune.out_4_svm$best.model
summary(bestmod_4_svm)
```

Plot the decision boundary:
```{r}
#plot(bestmod_4_svm, dat[train, ])
svm_plot(bestmod_4_svm, dat[train, ])
```

The degree-2 polynomial kernel captures some of the non-linearity of the decision boundary, but there are many errors.

#### Training error
```{r}
#mean(dat[train, "y"] != predict(bestmod_4_svm, newdata = dat[train, ]))
svm_error_train <- mean(summary(bestmod_4_svm)$fitted != y[train])
error_train <- c(error_train, "svm (d = 2)" = svm_error_train)
svm_error_train
```

#### Test Error

```{r}
ypred <- predict(bestmod_4_svm, dat[test, ])
table(predict = ypred, truth = dat[test, "y"])
```

```{r}
svm_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (d = 2)" = svm_error_test)
svm_error_test
```

### Support Vector Machine - Polynomial Kernel ($d$ = 3)

Train a support vector machine (polynomial kernel, $d$ = 3) on a range of values for `cost`:
```{r}
set.seed(1)
tune.out_4_svm_2 <- tune(svm, y ~ ., data = dat[train, ], kernel = "polynomial", degree = 3, ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune.out_4_svm_2)
```

Extract the best model and print summary:
```{r}
bestmod_4_svm_2 <- tune.out_4_svm_2$best.model
summary(bestmod_4_svm_2)
```

Plot the decision boundary:
```{r}
#plot(bestmod_4_svm_2, dat[train, ])
svm_plot(bestmod_4_svm_2, dat[train, ])
```

The degree-3 polynomial kernel captures some of the non-linearity of the decision boundary, but there are many errors. It seems that a polynomial of degree 3 is worse than that of degree 2.

#### Training error  
```{r}
mean(dat[train, "y"] != predict(bestmod_4_svm_2, newdata = dat[train, ]))
svm_2_error_train <- mean(summary(bestmod_4_svm_2)$fitted != y[train])
error_train <- c(error_train, "svm (d = 3)" = svm_2_error_train)
svm_2_error_train
```

#### Test Error  

```{r}
ypred <- predict(bestmod_4_svm_2, dat[test, ])
table(predict = ypred, truth = dat[test, "y"])
```

```{r}
svm_2_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (d = 3)" = svm_2_error_test)
svm_2_error_test
```

### Support Vector Machine - Radial Kernel

Train a support vector machine with a radial kernel on a range of values for `cost` and `gamma`:
```{r}
set.seed(1)
tune.out_4_svm_rad <- tune(svm, y ~ ., data = dat[train, ], kernel = "radial", ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100), gamma = c(0.5, 1, 2, 3, 4)))
summary(tune.out_4_svm_rad)
```

Extract the best model and print summary:
```{r}
bestmod_4_svm_rad <- tune.out_4_svm_rad$best.model
summary(bestmod_4_svm_rad)
```

Plot the decision boundary:
```{r}
#plot(bestmod_4_svm_rad, dat[train, ])
svm_plot(bestmod_4_svm_rad, dat[train, ])
```

The radial kernel has very accurately captured the decision boundary. In fact there are no errors.

#### Training error  
```{r}
#mean(dat[train, "y"] != predict(bestmod_4_svm_rad, newdata = dat[train, ]))
svm_rad_error_train <- mean(summary(bestmod_4_svm_rad)$fitted != y[train])
error_train <- c(error_train, "svm (radial)" = svm_rad_error_train)
svm_rad_error_train
```

#### Test Error  

```{r}
ypred <- predict(bestmod_4_svm_rad, dat[test, ])
table(predict = ypred, truth = dat[test, "y"])
```

```{r}
svm_rad_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (radial)" = svm_rad_error_test)
svm_rad_error_test
```

### Summary of Errors

```{r}
sort(error_train)
sort(error_test)
```

The SVM with radial kernel has the best training and test errors. However, the linear support vector classifier outperformed both of the SVM's with polynomial kernels.

Redo the analysis with a more complex non-linear decision boundary to see if the polynomial kernels outperform the linear kernel:
```{r}
# Prelims:
set.seed(1)

# Data: as before, but the "+1" class is split into two clusters (one shifted
# up, one shifted far down) to make the decision boundary more complex.
x1 <- rnorm(100)
x2 <- 4 * x1^2 + 1 + rnorm(100)

class <- sample(100, 50)
# Use disjoint halves 1:25 and 26:50 (25:50 would shift observation class[25]
# twice: first +3, then -10).
x2[class[1:25]] <- x2[class[1:25]] + 3
x2[class[26:50]] <- x2[class[26:50]] - 10
x2[-class] <- x2[-class] - 3

y <- rep(-1, 100)
y[class] <- 1

dat <- data.frame(x1 = x1,
                  x2 = x2,
                  y = as.factor(y))

# Errors (accumulated per model in the sections below):
error_train <- c()
error_test <- c()

# Train/Test
train <- sample(100, 80)
test <- -train # `<-` for assignment, consistent with the rest of the file

# Plots:
ggplot(dat) +
  geom_point(aes(x1, x2, group = y, color = y)) +
  theme_bw()
```

### Support Vector Classifier  
```{r}
set.seed(1)
tune.out_4_svc <- tune(svm, y ~ ., data = dat[train, ], kernel = "linear", ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
bestmod_4_svc <- tune.out_4_svc$best.model
#plot(bestmod_4_svc, dat[train, ])
svm_plot(bestmod_4_svc, dat[train, ])
```

The linear classifier fails to capture any of the non-linear decision boundary.

#### Training error  
```{r}
svc_error_train <- mean(summary(bestmod_4_svc)$fitted != y[train])
error_train <- c(error_train, "svc" = svc_error_train)
```

#### Test Error  
```{r}
ypred <- predict(bestmod_4_svc, dat[test, ])
table(predict = ypred, truth = y[test])
svc_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svc" = svc_error_test)
```


### Support Vector Machine - Polynomial Kernel ($d$ = 2)

```{r}
set.seed(1)
tune.out_4_svm <- tune(svm, y ~ ., data = dat[train, ], kernel = "polynomial", degree = 2, ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
bestmod_4_svm <- tune.out_4_svm$best.model
#plot(bestmod_4_svm, dat[train, ])
svm_plot(bestmod_4_svm, dat[train, ])
```

The degree-2 polynomial kernel captures some of the non-linearity of the decision boundary, but there are many errors.

#### Training error
```{r}
#mean(dat[train, "y"] != predict(bestmod_4_svm, newdata = dat[train, ]))
svm_error_train <- mean(summary(bestmod_4_svm)$fitted != y[train])
error_train <- c(error_train, "svm (d = 2)" = svm_error_train)
```

#### Test Error

```{r}
ypred <- predict(bestmod_4_svm, dat[test, ])
table(predict = ypred, truth = dat[test, "y"])
svm_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (d = 2)" = svm_error_test)
```

### Support Vector Machine (Polynomial Kernel, $d$ = 3)

```{r}
set.seed(1)
tune.out_4_svm_2 <- tune(svm, y ~ ., data = dat[train, ], kernel = "polynomial", degree = 3, ranges = list(cost = c (0.001, 0.01, 0.1, 1, 5, 10, 100)))
bestmod_4_svm_2 <- tune.out_4_svm_2$best.model
#plot(bestmod_4_svm_2, dat[train, ])
svm_plot(bestmod_4_svm_2, dat[train, ])
```

The degree-3 polynomial kernel captures some of the non-linearity of the decision boundary, but there are many errors.

#### Training error  
```{r}
svm_2_error_train <- mean(summary(bestmod_4_svm_2)$fitted != y[train])
error_train <- c(error_train, "svm (d = 3)" = svm_2_error_train)
```

#### Test Error  

```{r}
ypred <- predict(bestmod_4_svm_2, dat[test, ])
svm_2_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (d = 3)" = svm_2_error_test)
```

### Support Vector Machine - Radial Kernel  

```{r}
set.seed(1)
# Cross-validate cost and gamma jointly for a radial-kernel SVM.
tune.out_4_svm_rad <- tune(
  svm, y ~ ., data = dat[train, ],
  kernel = "radial",
  ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100),
                gamma = c(0.5, 1, 2, 3, 4))
)
bestmod_4_svm_rad <- tune.out_4_svm_rad$best.model
svm_plot(bestmod_4_svm_rad, dat[train, ])
```

#### Training error  
```{r}
# Training error for the radial-kernel fit.
fit_train_rad <- summary(bestmod_4_svm_rad)$fitted
svm_rad_error_train <- mean(fit_train_rad != y[train])
error_train <- c(error_train, "svm (radial)" = svm_rad_error_train)
```

#### Test Error  

```{r}
# Confusion matrix and test error for the radial-kernel fit.
ypred <- predict(bestmod_4_svm_rad, dat[test, ])
truth <- dat[test, "y"]
table(predict = ypred, truth = truth)
svm_rad_error_test <- mean(ypred != y[test])
error_test <- c(error_test, "svm (radial)" = svm_rad_error_test)
```

### Summary of Errors  

```{r}
# Ascending error rates: the best-performing model appears first in each vector.
sort(error_train)
sort(error_test)
```

The SVM with radial kernel has the best training and test errors. This time, with a more complex decision boundary, both of the SVMs with polynomial kernels outperformed the linear support vector classifier.

* * *

## Exercise 5

### &nbsp;&nbsp;(a)  

Generate a data set with $n = 500$ and $p = 2$, such that the observations belong to two classes with a quadratic decision boundary between them:
```{r}
set.seed(1)
n <- 500
x1 <- runif(n) - 0.5
x2 <- runif(n) - 0.5
# Quadratic decision boundary: the class is the sign of x1^2 - x2^2.
y <- 1 * ((x1^2 - x2^2) > 0)
df <- data.frame(x1, x2, y = as.factor(y))
```

### &nbsp;&nbsp;(b)  

Plot the observations, colored according to their class labels:
```{r}
# Scatter plot of the simulated data, colored by true class.
g0 <- ggplot(df) +
  geom_point(aes(x1, x2, group = y, color = y)) +
  scale_color_discrete(name = "True Class") +
  ggtitle("True Class Labels") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
g0
```

### &nbsp;&nbsp;(c)  

*Use all of the data as the training data*

Fit a logistic regression model to the data, using $X_1$ and $X_2$ as predictors:
```{r}
# Logistic regression with linear terms only; class labels at the 0.5 threshold.
logreg <- glm(y ~ x1 + x2, data = df, family = "binomial") 
probs <- predict(logreg, newdata = df, type = "response")
# Vectorized 0/1 thresholding; length follows the data instead of a
# hard-coded 500, and replaces the init-then-overwrite pattern.
preds <- as.numeric(probs > 0.5)
```

### &nbsp;&nbsp;(d)  

Apply this model to the training data in order to obtain a predicted class label for each training observation. Plot the observations, colored according to the predicted class labels. The decision boundary should be linear:
```{r fig.width=10}
# Predicted labels from the linear-terms model: the boundary is linear in (x1, x2).
df$logreg_preds <- as.factor(preds)
g1 <- ggplot(df) +
  geom_point(aes(x1, x2, group = logreg_preds, color = logreg_preds)) +
  scale_color_discrete(name = "Pred. Class") +
  ggtitle("Logistic Regression: y ~ x1 + x2") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
cowplot::plot_grid(g0, g1, nrow = 1)
```

### &nbsp;&nbsp;(e)  

Now fit a logistic regression model to the data using non-linear functions of $X_1$ and $X_2$ as predictors:
```{r message=FALSE, warning=FALSE}
formulas <- c("y ~ I(x1^2) + I(x2^2)",
              "y ~ I(x1^0.5) + I(x2^0.5)",
              "y ~ x1 + x2 + I(x1^2) + I(x2^2)",
              "y ~ x1 + x2 + I(x1^0.5) + I(x2^0.5)",
              "y ~ x1 + x2 + I(x1^2) + I(x2^2) + I(x1 * x2)",
              "y ~ x1 + x2 + I(x1^0.5) + I(x2^0.5) + I(x1 * x2)"
)

# Fit one logistic regression per formula and get fitted probabilities.
logregs <- map(formulas, ~glm(as.formula(.x), data = df, family = "binomial"))
probs <- map(logregs, ~predict(.x, newdata = df, type = "response"))
# Vectorized thresholding yields one 0/1 vector per model. This replaces the
# original preallocate-then-overwrite loop, which relied on an assignment side
# effect inside map() and a hard-coded n = 500.
preds <- map(probs, ~as.numeric(.x > 0.5))
```

### &nbsp;&nbsp;(f)   

With the models involving the squares of the predictors the predicted results look quite close to the true values. The models involving the square roots do not predict the classes well:
```{r fig.height=12, fig.width=10}
# One prediction plot per fitted model, titled with its formula.
G2 <- vector("list", length(formulas))
# seq_along() instead of 1:length(): safe even for a zero-length vector.
for (i in seq_along(formulas)) {
  df_ <- data.frame(x1, x2, preds = as.factor(preds[[i]]))
  G2[[i]] <- ggplot(df_) +
    geom_point(aes(x1, x2,
                   group = preds,
                   color = preds)) +
    ggtitle(label = formulas[i]) + # paste0() on a single string was redundant
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5)) +
    scale_color_discrete(name = "Pred. Class")
}

cowplot::plot_grid(plotlist = G2, nrow = 3)
```

### &nbsp;&nbsp;(g)  

Fit a support vector classifier to the data with $X_1$ and $X_2$ as predictors:
```{r fig.width=10}
# Tune a linear support vector classifier over a grid of costs.
svc.out <- tune(svm, y ~ ., data = df, kernel = "linear",
                ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
svc <- svc.out$best.model
preds <- predict(svc, df)
df_ <- data.frame(x1, x2, preds = as.factor(preds))
g3 <- ggplot(df_) +
  geom_point(aes(x1, x2, group = preds, color = preds)) +
  scale_color_discrete(name = "Pred. Class") +
  ggtitle("Support Vector Classifier") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
cowplot::plot_grid(g0, g3, nrow = 1)
```

### &nbsp;&nbsp;(h)  

Fit a SVM using a non-linear kernel to the data.

#### Support vector machine with polynomial kernel, $d$ = 2  
```{r fig.width=10}
# Degree-2 polynomial kernel, cost tuned by CV.
svm.out <- tune(svm, y ~ ., data = df, kernel = "polynomial", degree = 2,
                ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
svm1 <- svm.out$best.model
preds <- predict(svm1, df)
df_ <- data.frame(x1, x2, preds = as.factor(preds))
g4 <- ggplot(df_) +
  geom_point(aes(x1, x2, group = preds, color = preds)) +
  scale_color_discrete(name = "Pred. Class") +
  ggtitle("Support Vector Machine - Polynomial Kernel (d = 2)") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 10))
cowplot::plot_grid(g0, g4, nrow = 1)
```

#### Support vector machine with radial kernel

```{r fig.width=10}
# BUG FIX: the best model must come from the radial tuning run (`svm.out2`);
# the original took `svm.out$best.model`, i.e. the polynomial-kernel model,
# so this plot re-displayed the degree-2 fit. Also drop the spurious
# `degree = 2` argument, which only applies to polynomial kernels.
svm.out2 <- tune(svm, y ~ ., data = df, kernel = "radial",
                 ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100),
                               gamma = c(0.5, 1, 2, 3, 4)))
svm_rad <- svm.out2$best.model
preds <- predict(svm_rad, df)
df_ <- data.frame(x1, x2, preds = as.factor(preds))
g5 <- ggplot(df_) +
  geom_point(aes(x1, x2, group = preds, color = preds)) +
  scale_color_discrete(name = "Pred. Class") +
  ggtitle("Support Vector Machine - Radial Kernel") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, size = 12))
cowplot::plot_grid(g0, g5, nrow = 1)
```

### &nbsp;&nbsp;(i)  

The logistic regression model with linear combination of the variables and the support vector classifier both performed poorly. The logistic regression model with non-linear transformations of the variables, as well as support vector machines with polynomial and radial kernels all performed well. 

```{r fig.height=12, fig.width=10}
# Side-by-side comparison: truth, linear logistic, quadratic logistic,
# SVC, polynomial SVM, radial SVM.
g2 <- G2[[1]] + ggtitle("Logistic Regression: y ~ I(x1^2) + I(x2^2)")
cowplot::plot_grid(g0, g1, g2, g3, g4, g5, nrow = 3)
```

* * *

## Exercise 6

It's hard to figure out exactly what the best type of data is for this question. Most of the data sets tried result in zero test error as the cost increases. Using the below data set with a modest $n$ of 100 seems to work for what the question asks, i.e. the test error gets to a minimum before rising again as the cost increases.

### &nbsp;&nbsp;(a)  
```{r}
set.seed(1)

# Barely-separable data: a narrow x1 range and a near-linear boundary in x2.
n <- 100
x1 <- runif(n, max = 0.3, min = -0.3)
x2 <- rnorm(n, sd = 1)
y <- ifelse(-0.2 * x1 + x2 > 0, 1, -1)
df <- data.frame(x1, x2, y = as.factor(y))

# 80/20 train/test split (negative-index convention for the test set):
train <- sample(n, 4 * n / 5)
test <- -train

# Plot the two classes:
ggplot(df) +
  geom_point(aes(x1, x2, group = y, color = y)) +
  scale_color_discrete(name = "True Class", breaks = c(1, -1)) +
  ggtitle("True Class Labels") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
```

### &nbsp;&nbsp;(b) and (c)

Generate support vector classifiers via cross validation using a range of values for `cost` (use all of the data for CV, then we will use the training/test splits in the next section):
```{r}
# CV over a wide cost grid, including very large values to force overfitting.
costs <- c(0.001, 0.01, 0.1, 1, 5, 10, 100, 1e3, 1e4, 1e5)
svc.out <- tune(svm, y ~ ., data = df, kernel = "linear",
                ranges = list(cost = costs))
svc.cv.error <- svc.out$performances$error # extract the CV errors
svc.cv.error
```

The training and test errors can be obtained by training SVC models using a range of values for `cost` and then making predictions on the training and test data respectively:
```{r}
# Refit at each cost on the training split, recording train/test errors and a
# decision-boundary plot per fit.
svc.train.error <- vector("double", length(costs))
svc.test.error <- vector("double", length(costs))
G <- vector("list", length(costs))
# seq_along() instead of 1:length(): safe for zero-length vectors.
for (i in seq_along(costs)) {
  svc <- svm(y ~ ., data = df[train, ], kernel = "linear", cost = costs[i])
  G[[i]] <- svm_plot(svc, df[train, ]) + ggtitle(str_c("cost: ", costs[i]))
  
  # Training Error:
  preds <- predict(svc, df[train, ])
  svc.train.error[i] <- mean(preds != y[train])
  
  # Test Error:
  preds <- predict(svc, df[test, ])
  svc.test.error[i] <- mean(preds != y[test])
}

svc.train.error
svc.test.error
```

```{r fig.height=12, fig.width=10}
# One decision-boundary panel per cost value (10 panels, 5 rows).
plot_grid(plotlist = G, nrow = 5)
```

Plot the CV, training, and test errors:
```{r}
library(latex2exp)
svc.errors <- data.frame(cost = log10(costs),
                         cv.error = svc.cv.error,
                         train.error = svc.train.error,
                         test.error = svc.test.error)

# Each series gets a label inside aes(); scale_color_manual maps the labels to
# fixed colors and builds the legend automatically
# (see https://stackoverflow.com/a/50496899 and https://stackoverflow.com/a/10355844).
ggplot(svc.errors) +
  geom_line(aes(cost, cv.error, color = "CV")) +
  geom_line(aes(cost, train.error, color = "Train")) +
  geom_line(aes(cost, test.error, color = "Test")) +
  geom_hline(aes(yintercept = min(svc.test.error)), color = "grey30", linetype = "dashed") +
  scale_color_manual(name = "Error Rate",
                     breaks = c("Test", "Train", "CV"),
                     values = c("Test" = "#00BA38", "CV" = "#F8766D", "Train" = "#619CFF")) +
  labs(title = "Comparison of CV, Training and Test Error Rates",
       x = TeX("Log_{10}(Cost)"),
       y = "Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = c(0.97, 0.95), legend.justification = c(1, 1))
```

### &nbsp;&nbsp;(d)  

The error rates approach zero as the cost increases from small values towards 1. Then, the test error starts to rise again, suggesting that a lower value for cost is better, even though the training error is zero for larger values.

* * *

## Exercise 7

View data:
```{r}
# Auto-print the Auto data set (from ISLR).
Auto
```

Summary:
```{r}
# Per-column summaries (quartiles for numeric columns, counts for factors).
summary(Auto)
```

```{r}
# Treat origin as categorical and year as an ordered factor before modeling.
Auto$origin <- as.factor(Auto$origin)
Auto$year <- factor(Auto$year, ordered = TRUE)
```

```{r}
# First rows after the factor conversions.
head(Auto)
```

### &nbsp;&nbsp;(a)  

Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median:
```{r}
# Binary response: is a car's mpg above the median? mpg (column 1) is then
# dropped so it cannot leak into the predictors.
med_mpg <- median(Auto$mpg)
Auto$med01 <- as.factor(Auto$mpg > med_mpg)
Auto <- Auto[ , -1]
summary(Auto$med01)
```

### &nbsp;&nbsp;(b)  

Fit a support vector classifier to the data with various values of `cost`, in order to predict whether a car gets high or low gas mileage:
```{r}
set.seed(1)
# Linear-kernel CV over a wide cost grid.
costs <- c(0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 250, 500, 1e3, 5e3, 1e4, 5e4, 1e5)
svc.out <- tune(svm, med01 ~ ., data = Auto, kernel = "linear",
                ranges = list(cost = costs))
svc.cv.error <- svc.out$performances$error # extract the CV errors
svc.cv.error
```

Plot CV errors:
```{r}
# CV error as a function of log10(cost).
cv_df <- data.frame(cost = log10(costs), cv.error = svc.cv.error)
ggplot(cv_df) +
  geom_line(aes(cost, cv.error), color = ISLR_orange) +
  labs(title = "CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
```

### &nbsp;&nbsp;(c)  

Now repeat (b), this time using SVMs with radial and polynomial basis kernels, with different values of `gamma` and `degree` and `cost`.

#### Polynomial Kernel

```{r}
set.seed(1)
# Joint grid over polynomial degree and cost. The first variable listed in
# `ranges` varies fastest, so it indexes the rows of the matrix printed below.
costs <- c(0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 250, 500, 1e3, 5e3, 1e4, 5e4, 1e5)
degrees <- c(2, 3, 4, 5)
svc.out.poly <- tune(svm, med01 ~ ., data = Auto, kernel = "polynomial",
                     ranges = list(degree = degrees, cost = costs))
svc.poly.cv.error <- svc.out.poly$performances$error # extract the CV errors
round(matrix(svc.poly.cv.error, nrow = length(degrees)), 3)
```

Plot CV errors:
```{r fig.height=4, fig.width=8}
# Reshape the degree x cost error matrix to long form. After cbind-ing the cost
# column, the transposed matrix's columns are named V2..V5, corresponding to
# positions 1..4 of `degrees`. Derive the labels from `degrees` itself instead
# of relying on the "V" suffix coincidentally equalling the degree (it only
# matches because degrees happen to be 2:5); this also mirrors the gamma chunk.
dat_ <- data.frame(cbind(cost = log10(costs), t(matrix(svc.poly.cv.error, nrow = length(degrees))))) %>% 
        gather(key = "degree", value = "cv.error", V2:V5) %>% 
        mutate(degree = factor(str_c("degree: ", degrees[as.numeric(str_remove(degree, "V")) - 1])))

ggplot(dat_) +
  geom_line(aes(cost, cv.error, color = degree)) +
  labs(title = "Polynomial Kernel - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  ggsci::scale_color_d3() +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
```

#### Radial Kernel

```{r}
set.seed(1)
costs <- c(0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 250, 500, 1e3, 5e3, 1e4, 5e4, 1e5)
gammas <- c(0.01, 0.05, 0.1, 0.5)
# BUG FIX: this section tunes the RADIAL kernel, but the original call passed
# kernel = "polynomial", so the reported "radial" CV errors were actually
# polynomial-kernel errors. The first variable listed in `ranges` varies
# fastest and indexes the rows of the matrix printed below.
svc.out.rad <- tune(svm, med01 ~ ., data = Auto, kernel = "radial",
                    ranges = list(gamma = gammas, cost = costs))
svc.rad.cv.error <- svc.out.rad$performances$error # extract the CV errors
round(matrix(svc.rad.cv.error, nrow = length(gammas)), 3)
```

Plot CV errors:
```{r fig.height=4, fig.width=8}
# Reshape the gamma x cost error matrix to long form. After cbind-ing the cost
# column, the transposed matrix's columns are named V2..V5; V2 maps back to
# gammas[1], V3 to gammas[2], etc. — hence the "- 1" index shift below.
dat_ <- data.frame(cbind(cost = log10(costs), t(matrix(svc.rad.cv.error, nrow = length(gammas))))) %>% 
        gather(key = "gamma", value = "cv.error", V2:V5) %>% 
        mutate(gamma = str_c("gamma: ", factor(gammas[as.numeric(str_remove(gamma, "V")) - 1]))) # factor to make it a discrete var in ggplot()

ggplot(dat_) +
  geom_line(aes(cost, cv.error, color = gamma)) +
  labs(title = "Radial Kernel - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  ggsci::scale_color_d3() +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
```

### &nbsp;&nbsp;(d)  

A support vector machine with a linear kernel and `cost = 0.1` performs well, with a CV error of 0.087.

A support vector machine with a polynomial kernel of `degree` three also achieves a low CV error of 0.08. However, the `cost` is very large at 100,000.

A support vector machine with a radial kernel and parameters of `cost = 1` and `gamma = 0.5` has a comparably low CV error of 0.089. Other radial kernel SVMs with very large `cost` values achieve an equally low CV error, but a `gamma` value of 0.5 performs well at a lower `cost`, which would likely overfit less than the other models.

* * *

## Exercise 8

View data:
```{r}
# First rows of the OJ data (from ISLR).
head(OJ)
```

Summary:
```{r}
# Per-column summaries of the OJ data.
summary(OJ)
```

### &nbsp;&nbsp;(a)  

Create a training set containing a random sample of 800 observations, and a test set containing the remaining observations:
```{r}
# Prelims:
set.seed(1)

# Work on a copy of OJ:
df <- OJ

# 800 training rows; the remaining rows form the test set
# (negative-index convention):
train <- sample(1:nrow(OJ), 800)
test <- -train
```

### &nbsp;&nbsp;(b)  

Fit a support vector classifier to the training data using `cost = 0.01`, with `Purchase` as the response and the other variables as predictors:
```{r}
# Low-cost linear SVC: a wide margin, hence many support vectors.
svc_8 <- svm(Purchase ~ . , data = df[train, ], kernel = "linear", cost = 0.01)
summary(svc_8)
```

### &nbsp;&nbsp;(c)  

Training error:
```{r}
# Fraction of misclassified training observations.
svc_8.train.error <- mean(predict(svc_8, df[train, ]) != df$Purchase[train])
svc_8.train.error
```

Test error:
```{r}
# Fraction of misclassified test observations.
svc_8.test.error <- mean(predict(svc_8, df[test, ]) != df$Purchase[test])
svc_8.test.error
```

### &nbsp;&nbsp;(d)  

Use the tune() function to select an optimal `cost.` Consider values in the range 0.01 to 10.
```{r}
set.seed(1)
# Tune cost over [0.01, 10], as the exercise asks.
costs <- c(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10)
svc.out_8 <- tune(svm, Purchase ~ ., data = df[train, ], kernel = "linear",
                  ranges = list(cost = costs))
svc_8.cv.error <- svc.out_8$performances$error # extract the CV errors
svc_8.cv.error
```

Plot:
```{r}
# CV error as a function of log10(cost).
cv_df <- data.frame(cost = log10(costs), cv.error = svc_8.cv.error)
ggplot(cv_df) +
  geom_line(aes(cost, cv.error), color = ISLR_orange) +
  labs(title = "Linear Kernel - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
```

### &nbsp;&nbsp;(e)  

The training and test errors can be obtained by training SVC models using a range of values for `cost` and then making predictions on the training and test data respectively:
```{r}
set.seed(1)
# Refit the linear SVC at each cost, recording train and test error rates.
svc_8.train.error <- vector("double", length(costs))
svc_8.test.error <- vector("double", length(costs))
# seq_along() instead of 1:length(): safe for zero-length vectors.
for (i in seq_along(costs)) {
  svc <- svm(Purchase ~ ., data = df[train, ], kernel = "linear", cost = costs[i])
  
  # Training Error:
  preds <- predict(svc, df[train, ])
  svc_8.train.error[i] <- mean(preds != df$Purchase[train])
  
  # Test Error:
  preds <- predict(svc, df[test, ])
  svc_8.test.error[i] <- mean(preds != df$Purchase[test])
}

svc_8.train.error
svc_8.test.error
```

Plot the CV, training, and test errors:
```{r}
library(latex2exp)
svc_8.errors <- data.frame(cost = log10(costs),
                           cv.error = svc_8.cv.error,
                           train.error = svc_8.train.error,
                           test.error = svc_8.test.error)

# One labelled series per error type; scale_color_manual maps the labels to
# fixed colors and builds the legend
# (see https://stackoverflow.com/a/50496899 and https://stackoverflow.com/a/10355844).
g1 <- ggplot(svc_8.errors) +
  geom_line(aes(cost, cv.error, color = "CV")) +
  geom_line(aes(cost, train.error, color = "Train")) +
  geom_line(aes(cost, test.error, color = "Test")) +
  scale_color_manual(name = "Error Rate",
                     breaks = c("Test", "Train", "CV"),
                     values = c("Test" = "#00BA38", "CV" = "#F8766D", "Train" = "#619CFF")) +
  labs(title = "Linear Kernel - Comparison of CV, Training and Test Error Rates",
       x = TeX("Log_{10}(Cost)"),
       y = "Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = c(0.97, 0.95), legend.justification = c(1, 1))
g1
```

The training error seems to be higher than the test error.

### &nbsp;&nbsp;(f)  

#### Radial Kernel  

```{r}
set.seed(1)
# Radial-kernel SVM at the same low baseline cost as the linear fit.
svm_8_rad <- svm(Purchase ~ . , data = df[train, ], kernel = "radial", cost = 0.01)
summary(svm_8_rad)
```

Training error:
```{r}
# Training error rate for the radial fit.
svm_8_rad.train.error <- mean(predict(svm_8_rad, df[train, ]) != df$Purchase[train])
svm_8_rad.train.error
```

Test error:
```{r}
# Test error rate for the radial fit.
svm_8_rad.test.error <- mean(predict(svm_8_rad, df[test, ]) != df$Purchase[test])
svm_8_rad.test.error
```

Use CV:
```{r}
set.seed(1)
# Tune cost for the radial kernel (gamma is left at its default).
svm.out_8_rad <- tune(svm, Purchase ~ ., data = df[train, ], kernel = "radial",
                      ranges = list(cost = costs))
svm_8_rad.cv.error <- svm.out_8_rad$performances$error # extract the CV errors
svm_8_rad.cv.error
```

Plot CV error:
```{r}
# CV error as a function of log10(cost).
cv_df <- data.frame(cost = log10(costs), cv.error = svm_8_rad.cv.error)
ggplot(cv_df) +
  geom_line(aes(cost, cv.error), color = ISLR_orange) +
  labs(title = "Radial Kernel - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
```

The training and test errors can be obtained by training SVM models using a range of values for `cost` and then making predictions on the training and test data respectively:
```{r}
set.seed(1)
# Refit the radial SVM at each cost, recording train and test error rates.
svm_8_rad.train.error <- vector("double", length(costs))
svm_8_rad.test.error <- vector("double", length(costs))
# seq_along() instead of 1:length(): safe for zero-length vectors.
for (i in seq_along(costs)) {
  svm_8 <- svm(Purchase ~ ., data = df[train, ], kernel = "radial", cost = costs[i])
  
  # Training Error:
  preds <- predict(svm_8, df[train, ])
  svm_8_rad.train.error[i] <- mean(preds != df$Purchase[train])
  
  # Test Error:
  preds <- predict(svm_8, df[test, ])
  svm_8_rad.test.error[i] <- mean(preds != df$Purchase[test])
}

svm_8_rad.train.error
svm_8_rad.test.error
```

Plot the CV, training, and test errors:
```{r}
library(latex2exp)
svm_8_rad.errors <- data.frame(cost = log10(costs),
                               cv.error = svm_8_rad.cv.error,
                               train.error = svm_8_rad.train.error,
                               test.error = svm_8_rad.test.error)

# One labelled series per error type; scale_color_manual builds the legend
# (see https://stackoverflow.com/a/50496899 and https://stackoverflow.com/a/10355844).
g2 <- ggplot(svm_8_rad.errors) +
  geom_line(aes(cost, cv.error, color = "CV")) +
  geom_line(aes(cost, train.error, color = "Train")) +
  geom_line(aes(cost, test.error, color = "Test")) +
  scale_color_manual(name = "Error Rate",
                     breaks = c("Test", "Train", "CV"),
                     values = c("Test" = "#00BA38", "CV" = "#F8766D", "Train" = "#619CFF")) +
  labs(title = "Radial Kernel - Comparison of CV, Training and Test Error Rates",
       x = TeX("Log_{10}(Cost)"),
       y = "Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = c(0.97, 0.95), legend.justification = c(1, 1))
g2
```

With the radial kernel, the training error is lower than the test error.

### &nbsp;&nbsp;(g)  

#### Polynomial Kernel  

```{r}
set.seed(1)
# Degree-2 polynomial-kernel SVM at the same low baseline cost.
svm_8_poly <- svm(Purchase ~ . , data = df[train, ], kernel = "polynomial", degree = 2, cost = 0.01)
summary(svm_8_poly)
```

Training error:
```{r}
# Training error rate for the polynomial fit.
svm_8_poly.train.error <- mean(predict(svm_8_poly, df[train, ]) != df$Purchase[train])
svm_8_poly.train.error
```

Test error:
```{r}
# Test error rate for the polynomial fit.
svm_8_poly.test.error <- mean(predict(svm_8_poly, df[test, ]) != df$Purchase[test])
svm_8_poly.test.error
```

Use CV:
```{r}
set.seed(1)
# Tune cost for the degree-2 polynomial kernel.
svm.out_8_poly <- tune(svm, Purchase ~ ., data = df[train, ],
                       kernel = "polynomial", degree = 2,
                       ranges = list(cost = costs))
svm_8_poly.cv.error <- svm.out_8_poly$performances$error # extract the CV errors
svm_8_poly.cv.error
```

Plot CV error:
```{r}
# CV error as a function of log10(cost).
cv_df <- data.frame(cost = log10(costs), cv.error = svm_8_poly.cv.error)
ggplot(cv_df) +
  geom_line(aes(cost, cv.error), color = ISLR_orange) +
  labs(title = "Polynomial Kernel (d = 2) - CV Error Rate as Function of Cost",
       x = TeX("Log_{10}(Cost)"),
       y = "CV Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
```

The training and test errors can be obtained by training SVC models using a range of values for `cost` and then making predictions on the training and test data respectively:
```{r}
set.seed(1)
# Refit the polynomial SVM at each cost, recording train and test error rates.
svm_8_poly.train.error <- vector("double", length(costs))
svm_8_poly.test.error <- vector("double", length(costs))
# seq_along() instead of 1:length(): safe for zero-length vectors.
for (i in seq_along(costs)) {
  svm_8 <- svm(Purchase ~ ., data = df[train, ], kernel = "polynomial", degree = 2, cost = costs[i])
  
  # Training Error:
  preds <- predict(svm_8, df[train, ])
  svm_8_poly.train.error[i] <- mean(preds != df$Purchase[train])
  
  # Test Error:
  preds <- predict(svm_8, df[test, ])
  svm_8_poly.test.error[i] <- mean(preds != df$Purchase[test])
}

svm_8_poly.train.error
svm_8_poly.test.error
```

Plot the CV, training, and test errors:
```{r}
library(latex2exp)
svm_8_poly.errors <- data.frame(cost = log10(costs),
                                cv.error = svm_8_poly.cv.error,
                                train.error = svm_8_poly.train.error,
                                test.error = svm_8_poly.test.error)

# One labelled series per error type; scale_color_manual builds the legend
# (see https://stackoverflow.com/a/50496899 and https://stackoverflow.com/a/10355844).
g3 <- ggplot(svm_8_poly.errors) +
  geom_line(aes(cost, cv.error, color = "CV")) +
  geom_line(aes(cost, train.error, color = "Train")) +
  geom_line(aes(cost, test.error, color = "Test")) +
  scale_color_manual(name = "Error Rate",
                     breaks = c("Test", "Train", "CV"),
                     values = c("Test" = "#00BA38", "CV" = "#F8766D", "Train" = "#619CFF")) +
  labs(title = "Polynomial Kernel (d = 2) - Comparison of CV, Training and Test Error Rates",
       x = TeX("Log_{10}(Cost)"),
       y = "Error Rate") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        legend.position = c(0.97, 0.95), legend.justification = c(1, 1))
g3
```

With the polynomial kernel, the training error is lower than the test error.

### &nbsp;&nbsp;(h)  

Comparison of three kernels:
```{r fig.width=10, fig.height=4.5}
# Put all three kernels on a common y-axis for a fair visual comparison.
shared_y <- function() {
  scale_y_continuous(limits = c(0.14, 0.4), breaks = seq(0.1, 0.4, by = 0.05))
}
plot_grid(plotlist = list(
  g1 + ggtitle("Linear") + shared_y(),
  g2 + ggtitle("Radial") + shared_y() + theme(axis.title.y = element_blank()),
  g3 + ggtitle("Polynomial") + shared_y() + theme(axis.title.y = element_blank())
), nrow = 1)
```